Raw isoform-level counts were obtained from lr-kallisto (v0.51.1) and imported into R as a counts matrix. To ensure robust statistical analysis, transcripts were filtered to retain only those with at least 10 counts in two or more samples.
Differential isoform expression analysis was performed using the DESeq2 (v1.44.0) package. The design matrix incorporated disease status (AD vs. CTRL) as the primary factor while adjusting for sex as a covariate.
Results were annotated by joining transcript IDs to gene symbols using a reference transcript-to-gene mapping file generated from GENCODE v48 human reference genome, transcriptome, and annotated GTF files. Significantly differentially expressed isoforms were filtered by adjusted p-value < 0.05 and fold-change threshold (log2 fold-change ≥ |log2(1.5)|). Volcano plots highlighting significant isoforms were generated in ggplot2 (3.5.3).
Significant isoforms were further explored by plotting normalized counts across conditions and stratified by sex to visualize expression differences. Finally, principal component analysis (PCA) on variance-stabilized transformed counts was performed to assess sample clustering by condition and covariates.
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, table,
## tapply, union, unique, unsplit, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## The following object is masked from 'package:Biobase':
##
## rowMedians
## Loading required package: ensembldb
## Loading required package: GenomicFeatures
## Loading required package: AnnotationFilter
##
## Attaching package: 'ensembldb'
## The following object is masked from 'package:stats':
##
## filter
## Loading required package: ggplot2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.1.0 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%within%() masks IRanges::%within%()
## ✖ dplyr::collapse() masks IRanges::collapse()
## ✖ dplyr::combine() masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::count() masks matrixStats::count()
## ✖ dplyr::desc() masks IRanges::desc()
## ✖ tidyr::expand() masks S4Vectors::expand()
## ✖ dplyr::filter() masks ensembldb::filter(), stats::filter()
## ✖ dplyr::first() masks S4Vectors::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce() masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename() masks S4Vectors::rename()
## ✖ lubridate::second() masks S4Vectors::second()
## ✖ lubridate::second<-() masks S4Vectors::second<-()
## ✖ dplyr::select() masks ensembldb::select(), AnnotationDbi::select()
## ✖ dplyr::slice() masks IRanges::slice()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
knitr::opts_chunk$set(echo=TRUE, warning=FALSE, message=FALSE, fig.width=12, fig.height=8, fig.align = "center")## R version 4.4.0 (2024-04-24)
## Platform: x86_64-pc-linux-gnu
## Running under: AlmaLinux 8.10 (Cerulean Leopard)
##
## Matrix products: default
## BLAS/LAPACK: FlexiBLAS NETLIB; LAPACK version 3.11.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: America/New_York
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] lubridate_1.9.3 forcats_1.0.0
## [3] stringr_1.5.1 dplyr_1.1.4
## [5] purrr_1.1.0 readr_2.1.5
## [7] tidyr_1.3.1 tibble_3.2.1
## [9] tidyverse_2.0.0 knitr_1.50
## [11] ggrepel_0.9.6 ggplot2_3.5.2
## [13] EnsDb.Hsapiens.v86_2.99.0 ensembldb_2.28.0
## [15] AnnotationFilter_1.28.0 GenomicFeatures_1.56.0
## [17] DESeq2_1.44.0 SummarizedExperiment_1.34.0
## [19] MatrixGenerics_1.16.0 matrixStats_1.5.0
## [21] GenomicRanges_1.56.0 GenomeInfoDb_1.40.0
## [23] AnnotationDbi_1.66.0 IRanges_2.38.0
## [25] S4Vectors_0.42.1 Biobase_2.64.0
## [27] BiocGenerics_0.50.0
##
## loaded via a namespace (and not attached):
## [1] DBI_1.2.2 bitops_1.0-9 rlang_1.1.6
## [4] magrittr_2.0.3 compiler_4.4.0 RSQLite_2.3.6
## [7] png_0.1-8 vctrs_0.6.5 ProtGenerics_1.36.0
## [10] pkgconfig_2.0.3 crayon_1.5.3 fastmap_1.2.0
## [13] XVector_0.44.0 Rsamtools_2.20.0 rmarkdown_2.29
## [16] tzdb_0.4.0 UCSC.utils_1.0.0 bit_4.0.5
## [19] xfun_0.52 zlibbioc_1.50.0 cachem_1.1.0
## [22] jsonlite_2.0.0 blob_1.2.4 DelayedArray_0.30.1
## [25] BiocParallel_1.38.0 parallel_4.4.0 R6_2.6.1
## [28] bslib_0.9.0 stringi_1.8.3 RColorBrewer_1.1-3
## [31] rtracklayer_1.64.0 jquerylib_0.1.4 Rcpp_1.1.0
## [34] Matrix_1.7-0 timechange_0.3.0 tidyselect_1.2.1
## [37] rstudioapi_0.16.0 dichromat_2.0-0.1 abind_1.4-8
## [40] yaml_2.3.10 codetools_0.2-20 curl_6.4.0
## [43] lattice_0.22-6 withr_3.0.2 KEGGREST_1.44.0
## [46] evaluate_1.0.4 Biostrings_2.72.0 pillar_1.11.0
## [49] generics_0.1.4 RCurl_1.98-1.14 hms_1.1.3
## [52] scales_1.4.0 glue_1.8.0 lazyeval_0.2.2
## [55] tools_4.4.0 BiocIO_1.14.0 locfit_1.5-9.9
## [58] GenomicAlignments_1.40.0 XML_3.99-0.16.1 grid_4.4.0
## [61] colorspace_2.1-1 GenomeInfoDbData_1.2.12 restfulr_0.0.15
## [64] cli_3.6.5 S4Arrays_1.4.0 gtable_0.3.6
## [67] sass_0.4.10 digest_0.6.37 SparseArray_1.4.0
## [70] rjson_0.2.21 farver_2.1.2 memoise_2.0.1
## [73] htmltools_0.5.8.1 lifecycle_1.0.4 httr_1.4.7
## [76] bit64_4.0.5
# read in raw counts matrix
counts <- read.csv("../kallisto_counts/kallisto_isoform_counts_all.csv")
countsKeep transcripts with at least 2 samples that have ≥ 10 counts
# filter counts to keep transcripts with at least 2 samples with counts >= 10
# all columns to integers
counts[ , -1] <- lapply(counts[ , -1], as.integer)
counts_filtered <- counts[rowSums(counts[-1] >= 10)>= 2, ]
counts_filtered# create deseq2 object
# counts matrix
counts_matrix <- as.matrix(counts_filtered[-1])
row.names(counts_matrix) <- counts_filtered$isoform_id# col data
col_data <- tibble(
sample_name = colnames(counts_filtered[-1])) %>%
separate(sample_name,c("sample","condition"),sep="_",remove=FALSE) %>%
mutate(condition=as.factor(condition))
# add other covariates for pca
col_data$sex <- factor(c('male','male','male','female','female','female','male','female'))
#col_data$race <- factor(c('Black','Black', 'White', 'White', 'White','White','Black','Black'))
col_data# add gene names
t2g_file <- readr::read_delim("../../refs/human.t2g", delim = "\t", col_names = c("transcript_id", "gene_id", "gene_symbol", "gene_version", "chr", "start", "end", "strand")) %>%
dplyr::select(transcript_id, gene_symbol)
res_df <- res %>%
as.data.frame() %>%
arrange(padj) %>%
tibble::rownames_to_column("isoform_id") %>%
inner_join(t2g_file, by = join_by(isoform_id == transcript_id)) %>%
dplyr::select(isoform_id, gene_symbol, everything())
res_dfPadj threshold = 0.05 Log2FC threshold = +/- 1.5
# plot sample counts for significant differentially expressed isoforms
significant_isoforms <- res_df %>%
dplyr::filter(padj < 0.05) %>%
pull(isoform_id)
run_plot_counts <- function(dds, iso_id) {
gene_name <- res_df %>%
dplyr::filter(res_df$isoform_id == iso_id) %>%
pull(gene_symbol)
plot_data <- plotCounts(dds, iso_id, intgroup = c("condition", "sex"), returnData = TRUE)
p <- plot_data %>%
ggplot(aes(x=condition, y=count, color = sex)) +
geom_point(position=position_jitter(w=0.1,h=0)) +
ggtitle(paste0(gene_name, " (", iso_id, ")")) +
theme_minimal()
print(p)
}
for (isoform in significant_isoforms) {
run_plot_counts(dds,isoform)
}